Load Packages

library(ggplot2)
library(GGally)
## Registered S3 method overwritten by 'GGally':
##   method from   
##   +.gg   ggplot2
library(Metrics)
library(plyr)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:plyr':
## 
##     arrange, count, desc, failwith, id, mutate, rename, summarise,
##     summarize
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(reshape2)
library(corrplot)
## corrplot 0.92 loaded
library(plotly)
## 
## Attaching package: 'plotly'
## The following objects are masked from 'package:plyr':
## 
##     arrange, mutate, rename, summarise
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout
library(caTools)
library(stargazer)
## 
## Please cite as:
##  Hlavac, Marek (2022). stargazer: Well-Formatted Regression and Summary Statistics Tables.
##  R package version 5.2.3. https://CRAN.R-project.org/package=stargazer
library(class)
library(rpart)
library(rpart.plot)
library(randomForest)
## randomForest 4.7-1.1
## Type rfNews() to see new features/changes/bug fixes.
## 
## Attaching package: 'randomForest'
## The following object is masked from 'package:dplyr':
## 
##     combine
## The following object is masked from 'package:ggplot2':
## 
##     margin
library(e1071)
library(caret)
## Loading required package: lattice
## 
## Attaching package: 'caret'
## The following objects are masked from 'package:Metrics':
## 
##     precision, recall
#library(DescTools)

Data

# NOTE(review): rm(list = ls()) wipes the user's entire session and is
# discouraged in shared scripts -- consider removing it outside notebook use.
rm(list=ls())       # Remove anything that might currently be in memory so we can start fresh.
library(MASS)       # Load the MASS package
## 
## Attaching package: 'MASS'
## The following object is masked from 'package:plotly':
## 
##     select
## The following object is masked from 'package:dplyr':
## 
##     select
# Read the raw Boston housing data and replace the terse original column
# names (crim, zn, indus, chas, nox, rm, age, dis, rad, tax, ptratio, b,
# lstat, medv) with descriptive ones.
# NOTE(review): assumes Boston.csv columns arrive in exactly this order --
# verify against the file header.
BostonHousing <- read.csv("Boston.csv")
colnames(BostonHousing) <- c("CrimePerCapita","Residential","NonRetail","CharlesRiver","N_Oxides",
                  "RoomsPerDwelling","Pre40HouseAge","BusinessCenterProximity",
                 "HighwayProximity","Tax","PupilPerTeacher","BlackIndex",
                 "LowStatus","HomeValue" )
print(head(BostonHousing))
##   CrimePerCapita Residential NonRetail CharlesRiver N_Oxides RoomsPerDwelling
## 1        0.00632          18      2.31            0    0.538            6.575
## 2        0.02731           0      7.07            0    0.469            6.421
## 3        0.02729           0      7.07            0    0.469            7.185
## 4        0.03237           0      2.18            0    0.458            6.998
## 5        0.06905           0      2.18            0    0.458            7.147
## 6        0.02985           0      2.18            0    0.458            6.430
##   Pre40HouseAge BusinessCenterProximity HighwayProximity Tax PupilPerTeacher
## 1          65.2                  4.0900                1 296            15.3
## 2          78.9                  4.9671                2 242            17.8
## 3          61.1                  4.9671                2 242            17.8
## 4          45.8                  6.0622                3 222            18.7
## 5          54.2                  6.0622                3 222            18.7
## 6          58.7                  6.0622                3 222            18.7
##   BlackIndex LowStatus HomeValue
## 1     396.90      4.98      24.0
## 2     396.90      9.14      21.6
## 3     392.83      4.03      34.7
## 4     394.63      2.94      33.4
## 5     396.90      5.33      36.2
## 6     394.12      5.21      28.7

Data Description

This data is taken from the Boston Standard Metropolitan Statistical Area (SMSA) in 1970. It was uploaded to Kaggle.com by Patrick Parsa.

# Structure check: 506 observations of 14 numeric/integer columns.
str(BostonHousing)
## 'data.frame':    506 obs. of  14 variables:
##  $ CrimePerCapita         : num  0.00632 0.02731 0.02729 0.03237 0.06905 ...
##  $ Residential            : num  18 0 0 0 0 0 12.5 12.5 12.5 12.5 ...
##  $ NonRetail              : num  2.31 7.07 7.07 2.18 2.18 2.18 7.87 7.87 7.87 7.87 ...
##  $ CharlesRiver           : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ N_Oxides               : num  0.538 0.469 0.469 0.458 0.458 0.458 0.524 0.524 0.524 0.524 ...
##  $ RoomsPerDwelling       : num  6.58 6.42 7.18 7 7.15 ...
##  $ Pre40HouseAge          : num  65.2 78.9 61.1 45.8 54.2 58.7 66.6 96.1 100 85.9 ...
##  $ BusinessCenterProximity: num  4.09 4.97 4.97 6.06 6.06 ...
##  $ HighwayProximity       : int  1 2 2 3 3 3 5 5 5 5 ...
##  $ Tax                    : num  296 242 242 222 222 222 311 311 311 311 ...
##  $ PupilPerTeacher        : num  15.3 17.8 17.8 18.7 18.7 18.7 15.2 15.2 15.2 15.2 ...
##  $ BlackIndex             : num  397 397 393 395 397 ...
##  $ LowStatus              : num  4.98 9.14 4.03 2.94 5.33 ...
##  $ HomeValue              : num  24 21.6 34.7 33.4 36.2 28.7 22.9 27.1 16.5 18.9 ...
# Text summary table (N, mean, sd, min, max) for every column.
stargazer(BostonHousing, type = 'text', title='BostonHousing: Summary Statistics')
## 
## BostonHousing: Summary Statistics
## ===========================================================
## Statistic                N   Mean   St. Dev.  Min     Max  
## -----------------------------------------------------------
## CrimePerCapita          506  3.614   8.602   0.006  88.976 
## Residential             506 11.364   23.322  0.000  100.000
## NonRetail               506 11.137   6.860   0.460  27.740 
## CharlesRiver            506  0.069   0.254     0       1   
## N_Oxides                506  0.555   0.116   0.385   0.871 
## RoomsPerDwelling        506  6.285   0.703   3.561   8.780 
## Pre40HouseAge           506 68.575   28.149  2.900  100.000
## BusinessCenterProximity 506  3.795   2.106   1.130  12.126 
## HighwayProximity        506  9.549   8.707     1      24   
## Tax                     506 408.237 168.537   187     711  
## PupilPerTeacher         506 18.456   2.165   12.600 22.000 
## BlackIndex              506 356.674  91.295  0.320  396.900
## LowStatus               506 12.653   7.141   1.730  37.970 
## HomeValue               506 22.533   9.197   5.000  50.000 
## -----------------------------------------------------------

There are 506 rows and 14 columns. Each row represents a different town or suburb observed at the time by the SMSA. Each column represents different locality characteristics within each town deemed noteworthy by the SMSA. These columns represent the variables of the dataset. All the columns have numeric values, with a few columns containing pure integers as a representation of categorical variables. We are going to build a model to predict the median home value within these localities.

Target Variable

HomeValue is our target variable. HomeValue describes the median value of the homes in each town or suburb observed within this dataset. Every other variable is considered an independent variable that will help in predicting the HomeValue in our model. All values in HomeValue are recorded in units of $1000.

Independent Variables

With HomeValue as our target variable, we are left with 13 independent variables:

CrimePerCapita : The crime per capita by town
Residential : The proportion of residential land zoned for lots over 25,000 square feet.
NonRetail : The proportion of non-retail business acres per town.
CharlesRiver : This is a dummy variable to record whether an area of land is bounded by the Charles River. 1 for yes, 0 for no.
N_Oxides : The nitric oxide concentration recorded in parts per 10 million.
RoomsPerDwelling : The average rooms per home for the town.
Pre40HouseAge : The proportion of owner-occupied units built prior to 1940.
BusinessCenterProximity : The weighted distances to five Boston employment centers.
HighwayProximity : Index of accessibility to highways leading to or from an urban center.
Tax : The full-value property tax rate per $10,000.
PupilPerTeacher : The student-teacher ratio by town.
BlackIndex : The proportion of black people by town.
LowStatus : The percentage of lower status of the population.

Exploratory Data Analysis

Distribution of HomeValue

# Visualizing the distribution of the dependent variable, HomeValue.

BostonHousing %>%
  ggplot() +
  geom_histogram(aes(x = HomeValue), colour = "black", fill = "black")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

The graph of HomeValue is slightly right-skewed.

Distribution of CrimePerCapita

# Distribution of CrimePerCapita.
BostonHousing %>%
  ggplot() +
  geom_histogram(aes(x = CrimePerCapita), colour = "black", fill = "black")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

HomeValue ~ CrimePerCapita

# HomeValue against CrimePerCapita.
BostonHousing %>%
  ggplot(aes(CrimePerCapita, HomeValue)) +
  geom_point()

The distribution of CrimePerCapita is heavily right-skewed. When plotting HomeValue with respect to CrimePerCapita, we can see a negative correlation. As CrimePerCapita increases, the HomeValue tends to decrease.

Distribution of Residential

# Distribution of Residential.
BostonHousing %>%
  ggplot() +
  geom_histogram(aes(x = Residential), colour = "black", fill = "black")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

HomeValue ~ Residential

# HomeValue against Residential.
BostonHousing %>%
  ggplot(aes(Residential, HomeValue)) +
  geom_point()

The distribution of Residential is right-skewed. When plotting HomeValue with respect to Residential, we observe a generally positive correlation, such that as the Residential proportion increases, the HomeValue increases as well.

Distribution of NonRetail

# Distribution of NonRetail.
BostonHousing %>%
  ggplot() +
  geom_histogram(aes(x = NonRetail), colour = "black", fill = "black")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

HomeValue ~ NonRetail

# HomeValue against NonRetail.
BostonHousing %>%
  ggplot(aes(NonRetail, HomeValue)) +
  geom_point()

The distribution of NonRetail is slightly left-skewed. When plotting HomeValue with respect to NonRetail, we observe a negative correlation such that as NonRetail proportion increases, HomeValue decreases.

Distribution of CharlesRiver

# Bar chart of the CharlesRiver dummy (0/1).
BostonHousing %>%
  ggplot() +
  geom_bar(aes(x = CharlesRiver), colour = "black", fill = "black")

HomeValue ~ CharlesRiver

# HomeValue against CharlesRiver.
BostonHousing %>%
  ggplot(aes(CharlesRiver, HomeValue)) +
  geom_point()

The distribution of CharlesRiver is heavily right-skewed with the majority of data points having a value of ‘0’.

Distribution of N_Oxides

# Distribution of N_Oxides.
BostonHousing %>%
  ggplot() +
  geom_histogram(aes(x = N_Oxides), colour = "black", fill = "black")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

HomeValue ~ N_Oxides

# HomeValue against N_Oxides.
BostonHousing %>%
  ggplot(aes(N_Oxides, HomeValue)) +
  geom_point()

The distribution of N_Oxides is right-skewed. When plotting HomeValue with respect to N_Oxides, we can observe a negative correlation such that as N_Oxides increases, HomeValue decreases. This relationship is semi-intuitive since most individuals would prefer not to live in an area that is more heavily polluted.

Distribution of RoomsPerDwelling

# Distribution of RoomsPerDwelling.
BostonHousing %>%
  ggplot() +
  geom_histogram(aes(x = RoomsPerDwelling), colour = "black", fill = "black")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

HomeValue ~ RoomsPerDwelling

# HomeValue against RoomsPerDwelling.
BostonHousing %>%
  ggplot(aes(RoomsPerDwelling, HomeValue)) +
  geom_point()

The distribution of RoomsPerDwelling appears relatively normal. Looking at this distribution, the average number of rooms within these localities range between 5 to 7. When plotting HomeValue with respect to RoomsPerDwelling, we observe a positive correlation such that as RoomsPerDwelling increases, HomeValue increases as well. Most people would prefer houses with more rooms, however those typically would cost more.

Distribution of Pre40HouseAge

# Distribution of Pre40HouseAge.
BostonHousing %>%
  ggplot() +
  geom_histogram(aes(x = Pre40HouseAge), colour = "black", fill = "black")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

HomeValue ~ Pre40HouseAge

# HomeValue against Pre40HouseAge.
BostonHousing %>%
  ggplot(aes(Pre40HouseAge, HomeValue)) +
  geom_point()

The distribution of Pre40HouseAge is left-skewed. Intuitively, older houses cost less than newer houses. Thus we can observe a slight negative correlation when plotting HomeValue with respect to Pre40HouseAge.

Distribution of BusinessCenterProximity

# Distribution of BusinessCenterProximity.
BostonHousing %>%
  ggplot() +
  geom_histogram(aes(x = BusinessCenterProximity), colour = "black", fill = "black")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

HomeValue ~ BusinessCenterProximity

# HomeValue against BusinessCenterProximity.
BostonHousing %>%
  ggplot(aes(BusinessCenterProximity, HomeValue)) +
  geom_point()

The distribution of BusinessCenterProximity is right-skewed. Typically, the closer you are to a business center, the smaller the size of your house would be. This may be a reason why HomeValue is lower the closer you are to a business center. In contrast, the further away you are, the more land you are able to buy, so the HomeValue would be greater to a certain extent.

Distribution of HighwayProximity

# NOTE: candidate for removal from the feature set.
BostonHousing %>%
  ggplot() +
  geom_histogram(aes(x = HighwayProximity), colour = "black", fill = "black")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

HomeValue ~ HighwayProximity

# HomeValue against HighwayProximity.
BostonHousing %>%
  ggplot(aes(HighwayProximity, HomeValue)) +
  geom_point()

Distribution of Tax

# Distribution of Tax.
BostonHousing %>%
  ggplot() +
  geom_histogram(aes(x = Tax), colour = "black", fill = "black")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

HomeValue ~ Tax

# HomeValue against Tax.
BostonHousing %>%
  ggplot(aes(Tax, HomeValue)) +
  geom_point()

The distribution of Tax is left-skewed. When plotting HomeValue with respect to Tax, we observe a slight negative correlation, such that as Property Tax rate goes up, HomeValue decreases.

Distribution of PupilPerTeacher

# Distribution of PupilPerTeacher.
BostonHousing %>%
  ggplot() +
  geom_histogram(aes(x = PupilPerTeacher), colour = "black", fill = "black")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

HomeValue ~ PupilPerTeacher

# HomeValue against PupilPerTeacher.
BostonHousing %>%
  ggplot(aes(PupilPerTeacher, HomeValue)) +
  geom_point()

The distribution of PupilPerTeacher is left-skewed. When plotting HomeValue with respect to PupilPerTeacher, we observe a negative correlation such that as PupilPerTeacher increases, HomeValue decreases. House prices typically are very driven by the quality of schools within the area. Good schools usually have low pupil-teacher ratio, thus this negative correlation makes sense.

Distribution of BlackIndex

# Distribution of BlackIndex.
BostonHousing %>%
  ggplot() +
  geom_histogram(aes(x = BlackIndex), colour = "black", fill = "black")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Home Value ~ BlackIndex

# HomeValue against BlackIndex.
BostonHousing %>%
  ggplot(aes(BlackIndex, HomeValue)) +
  geom_point()

The distribution of BlackIndex is left-skewed. When plotting HomeValue with respect to BlackIndex, we observe a positive correlation, such that as BlackIndex increases, HomeValue increases as well.

Distribution of LowStatus

# Distribution of LowStatus.
BostonHousing %>%
  ggplot() +
  geom_histogram(aes(x = LowStatus), colour = "black", fill = "black")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

HomeValue ~ LowStatus

# HomeValue against LowStatus.
BostonHousing %>%
  ggplot(aes(LowStatus, HomeValue)) +
  geom_point()

The distribution of LowStatus is slightly right-skewed. When plotting HomeValue with respect to LowStatus, we observe a negative correlation, such that as LowStatus percentage increases, HomeValue decreases.

Correlations between independent variables

# Pairwise Pearson correlations across every variable (columns are listed
# explicitly, in their data-set order, so the matrix layout is predictable).
cor_vars <- BostonHousing %>%
  dplyr::select(CrimePerCapita, Residential, NonRetail, CharlesRiver,
                N_Oxides, RoomsPerDwelling, Pre40HouseAge,
                BusinessCenterProximity, HighwayProximity, Tax,
                PupilPerTeacher, BlackIndex, LowStatus, HomeValue)
cor(cor_vars)
##                         CrimePerCapita Residential   NonRetail CharlesRiver
## CrimePerCapita              1.00000000 -0.20046922  0.40658341 -0.055891582
## Residential                -0.20046922  1.00000000 -0.53382819 -0.042696719
## NonRetail                   0.40658341 -0.53382819  1.00000000  0.062938027
## CharlesRiver               -0.05589158 -0.04269672  0.06293803  1.000000000
## N_Oxides                    0.42097171 -0.51660371  0.76365145  0.091202807
## RoomsPerDwelling           -0.21924670  0.31199059 -0.39167585  0.091251225
## Pre40HouseAge               0.35273425 -0.56953734  0.64477851  0.086517774
## BusinessCenterProximity    -0.37967009  0.66440822 -0.70802699 -0.099175780
## HighwayProximity            0.62550515 -0.31194783  0.59512927 -0.007368241
## Tax                         0.58276431 -0.31456332  0.72076018 -0.035586518
## PupilPerTeacher             0.28994558 -0.39167855  0.38324756 -0.121515174
## BlackIndex                 -0.38506394  0.17552032 -0.35697654  0.048788485
## LowStatus                   0.45562148 -0.41299457  0.60379972 -0.053929298
## HomeValue                  -0.38830461  0.36044534 -0.48372516  0.175260177
##                            N_Oxides RoomsPerDwelling Pre40HouseAge
## CrimePerCapita           0.42097171      -0.21924670    0.35273425
## Residential             -0.51660371       0.31199059   -0.56953734
## NonRetail                0.76365145      -0.39167585    0.64477851
## CharlesRiver             0.09120281       0.09125123    0.08651777
## N_Oxides                 1.00000000      -0.30218819    0.73147010
## RoomsPerDwelling        -0.30218819       1.00000000   -0.24026493
## Pre40HouseAge            0.73147010      -0.24026493    1.00000000
## BusinessCenterProximity -0.76923011       0.20524621   -0.74788054
## HighwayProximity         0.61144056      -0.20984667    0.45602245
## Tax                      0.66802320      -0.29204783    0.50645559
## PupilPerTeacher          0.18893268      -0.35550149    0.26151501
## BlackIndex              -0.38005064       0.12806864   -0.27353398
## LowStatus                0.59087892      -0.61380827    0.60233853
## HomeValue               -0.42732077       0.69535995   -0.37695457
##                         BusinessCenterProximity HighwayProximity         Tax
## CrimePerCapita                      -0.37967009      0.625505145  0.58276431
## Residential                          0.66440822     -0.311947826 -0.31456332
## NonRetail                           -0.70802699      0.595129275  0.72076018
## CharlesRiver                        -0.09917578     -0.007368241 -0.03558652
## N_Oxides                            -0.76923011      0.611440563  0.66802320
## RoomsPerDwelling                     0.20524621     -0.209846668 -0.29204783
## Pre40HouseAge                       -0.74788054      0.456022452  0.50645559
## BusinessCenterProximity              1.00000000     -0.494587930 -0.53443158
## HighwayProximity                    -0.49458793      1.000000000  0.91022819
## Tax                                 -0.53443158      0.910228189  1.00000000
## PupilPerTeacher                     -0.23247054      0.464741179  0.46085304
## BlackIndex                           0.29151167     -0.444412816 -0.44180801
## LowStatus                           -0.49699583      0.488676335  0.54399341
## HomeValue                            0.24992873     -0.381626231 -0.46853593
##                         PupilPerTeacher  BlackIndex  LowStatus  HomeValue
## CrimePerCapita                0.2899456 -0.38506394  0.4556215 -0.3883046
## Residential                  -0.3916785  0.17552032 -0.4129946  0.3604453
## NonRetail                     0.3832476 -0.35697654  0.6037997 -0.4837252
## CharlesRiver                 -0.1215152  0.04878848 -0.0539293  0.1752602
## N_Oxides                      0.1889327 -0.38005064  0.5908789 -0.4273208
## RoomsPerDwelling             -0.3555015  0.12806864 -0.6138083  0.6953599
## Pre40HouseAge                 0.2615150 -0.27353398  0.6023385 -0.3769546
## BusinessCenterProximity      -0.2324705  0.29151167 -0.4969958  0.2499287
## HighwayProximity              0.4647412 -0.44441282  0.4886763 -0.3816262
## Tax                           0.4608530 -0.44180801  0.5439934 -0.4685359
## PupilPerTeacher               1.0000000 -0.17738330  0.3740443 -0.5077867
## BlackIndex                   -0.1773833  1.00000000 -0.3660869  0.3334608
## LowStatus                     0.3740443 -0.36608690  1.0000000 -0.7376627
## HomeValue                    -0.5077867  0.33346082 -0.7376627  1.0000000
# Long-format (Var1, Var2, value) version of the correlation matrix,
# suitable for a ggplot tile heat map.
# NOTE(review): melted_cormat is never used afterwards -- the heat map below
# is drawn with corrplot() instead; these two lines are dead code.
trans<-cor(cor_vars)
melted_cormat <- melt(trans)

Visualize the correlation between all variables on a heat map

# Correlation heat map, excluding the CharlesRiver dummy and BlackIndex.
BostonHousing %>%
  dplyr::select(-CharlesRiver, -BlackIndex) %>%
  cor() %>%
  corrplot()

HomeValue decreases with increase in CrimePerCapita (Medium),NonRetail(High), N_Oxides(Low), Pre40HouseAge(Low), HighwayProximity(Low), Tax(Low), PupilPerTeacher(High), and LowStatus(High). HomeValue increases with increase in Residential(Low) and RoomsPerDwelling(High)

Visualize the effect of variables on HomeValue

# Faceted scatter of each selected predictor against HomeValue, with a
# loess smoother per facet.
BostonHousing %>%
  dplyr::select(c(CrimePerCapita, RoomsPerDwelling, Pre40HouseAge, HighwayProximity, Tax, LowStatus, HomeValue, NonRetail, N_Oxides, PupilPerTeacher, Residential)) %>%
  melt(id.vars = 'HomeValue') %>%
  ggplot(aes(x = value, y = HomeValue, color = variable)) +
  geom_point(alpha = 0.7) +
  # Fix: set the smoother's colour as a literal layer parameter. Wrapping it
  # in aes() maps the constant string "black" as data, which draws a
  # default-palette line and adds a spurious legend entry instead of a
  # black line.
  stat_smooth(color = 'black') +
  facet_wrap(~variable, scales = 'free', ncol = 2) + 
  labs(x = 'Variable Value', y = 'Median House Price ($1000s)') +
  theme_minimal()
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric = parametric,
## : pseudoinverse used at -0.5
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric = parametric,
## : neighborhood radius 13
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric = parametric,
## : reciprocal condition number 2.9038e-31
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric = parametric,
## : There are other near singularities as well. 156.25
## Warning in predLoess(object$y, object$x, newx = if (is.null(newdata)) object$x
## else if (is.data.frame(newdata))
## as.matrix(model.frame(delete.response(terms(object)), : pseudoinverse used at
## -0.5
## Warning in predLoess(object$y, object$x, newx = if (is.null(newdata)) object$x
## else if (is.data.frame(newdata))
## as.matrix(model.frame(delete.response(terms(object)), : neighborhood radius 13
## Warning in predLoess(object$y, object$x, newx = if (is.null(newdata)) object$x
## else if (is.data.frame(newdata))
## as.matrix(model.frame(delete.response(terms(object)), : reciprocal condition
## number 2.9038e-31
## Warning in predLoess(object$y, object$x, newx = if (is.null(newdata)) object$x
## else if (is.data.frame(newdata))
## as.matrix(model.frame(delete.response(terms(object)), : There are other near
## singularities as well. 156.25

These plots show the same correlations as the heat map.

Scaling the data

# Location and spread of the target before standardizing.
mean(BostonHousing$HomeValue)
## [1] 22.53281
sd(BostonHousing$HomeValue)
## [1] 9.197104

Our mean should be close to zero and standard deviation close to 1. We will need to scale the data.

# Standardize every column to z-scores (mean 0, sd 1).
# NOTE(review): scaling is applied before the train/test split, so test-set
# information leaks into the scaling parameters; standardizing the
# CharlesRiver dummy and the target HomeValue is also unusual -- consider
# scaling only the predictors using training-set statistics.
BostonHousing <- scale(BostonHousing,center = TRUE, scale = TRUE)
colMeans(BostonHousing)  # faster version of apply(scaled.dat, 2, mean)
##          CrimePerCapita             Residential               NonRetail 
##           -6.899468e-18            2.298337e-17            1.516683e-17 
##            CharlesRiver                N_Oxides        RoomsPerDwelling 
##           -3.510587e-18           -2.149412e-16           -1.058524e-16 
##           Pre40HouseAge BusinessCenterProximity        HighwayProximity 
##           -1.645039e-16            1.144506e-16            4.651527e-17 
##                     Tax         PupilPerTeacher              BlackIndex 
##            1.906139e-17           -3.931034e-16           -1.155991e-16 
##               LowStatus               HomeValue 
##           -7.012260e-17           -1.379311e-16
# Every column's standard deviation should now be exactly 1.
apply(BostonHousing, 2, sd)
##          CrimePerCapita             Residential               NonRetail 
##                       1                       1                       1 
##            CharlesRiver                N_Oxides        RoomsPerDwelling 
##                       1                       1                       1 
##           Pre40HouseAge BusinessCenterProximity        HighwayProximity 
##                       1                       1                       1 
##                     Tax         PupilPerTeacher              BlackIndex 
##                       1                       1                       1 
##               LowStatus               HomeValue 
##                       1                       1
# scale() returns a matrix; convert back to a data frame for modeling.
BostonHousing <- as.data.frame(BostonHousing)
mean(BostonHousing$HomeValue)
## [1] -1.374631e-16
sd(BostonHousing$HomeValue)
## [1] 1

Splitting the data into training and testing set.

#make this example reproducible
set.seed(1)

#use 70% of dataset as training set and 30% as test set
# Each row is independently assigned to the training set with probability
# 0.7, so the realized split (360/146 here) is random rather than exactly
# 70/30. Do not restyle this call: the RNG stream determines the split.
BH<- sample(c(TRUE, FALSE), nrow(BostonHousing), replace=TRUE, prob=c(0.7,0.3))
TrainSet  <- BostonHousing[BH, ]
TestSet   <- BostonHousing[!BH, ]
dim(TrainSet)
## [1] 360  14
dim(TestSet)
## [1] 146  14

Model 1: Multiple Linear Regression with all variables.

# Model 1: OLS regression of HomeValue on all 13 predictors (equivalent to
# HomeValue ~ ., written out explicitly).
ModelLR1<-lm(HomeValue ~ CrimePerCapita + Residential + NonRetail + CharlesRiver + N_Oxides + RoomsPerDwelling + 
              Pre40HouseAge + BusinessCenterProximity + HighwayProximity + Tax + PupilPerTeacher + BlackIndex + LowStatus
              , data = TrainSet)
summary(ModelLR1)
## 
## Call:
## lm(formula = HomeValue ~ CrimePerCapita + Residential + NonRetail + 
##     CharlesRiver + N_Oxides + RoomsPerDwelling + Pre40HouseAge + 
##     BusinessCenterProximity + HighwayProximity + Tax + PupilPerTeacher + 
##     BlackIndex + LowStatus, data = TrainSet)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.70161 -0.29204 -0.05476  0.18336  2.97775 
## 
## Coefficients:
##                         Estimate Std. Error t value Pr(>|t|)    
## (Intercept)             -0.02980    0.02685  -1.110 0.267787    
## CrimePerCapita          -0.12613    0.03837  -3.287 0.001117 ** 
## Residential              0.13466    0.03954   3.405 0.000738 ***
## NonRetail                0.05211    0.05361   0.972 0.331675    
## CharlesRiver             0.03430    0.02723   1.260 0.208589    
## N_Oxides                -0.20725    0.05641  -3.674 0.000277 ***
## RoomsPerDwelling         0.34335    0.03934   8.728  < 2e-16 ***
## Pre40HouseAge            0.02398    0.04939   0.486 0.627586    
## BusinessCenterProximity -0.31349    0.05144  -6.095 2.92e-09 ***
## HighwayProximity         0.29367    0.07419   3.958 9.17e-05 ***
## Tax                     -0.25623    0.08311  -3.083 0.002212 ** 
## PupilPerTeacher         -0.20690    0.03576  -5.786 1.61e-08 ***
## BlackIndex               0.12153    0.03236   3.755 0.000203 ***
## LowStatus               -0.38857    0.04849  -8.014 1.72e-14 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.5054 on 346 degrees of freedom
## Multiple R-squared:  0.7546, Adjusted R-squared:  0.7454 
## F-statistic: 81.84 on 13 and 346 DF,  p-value: < 2.2e-16

Predicting the performance of our model on the testing data set

# Predict on the held-out data and attach the predictions for plotting.
predictedHomeValueLR1 <- predict(ModelLR1, TestSet)
TestSet['Predicted.HomeValueLR1'] <- predictedHomeValueLR1

# Actual vs. predicted home value with a loess smoother.
pl1 <- TestSet %>%
  ggplot(aes(x = HomeValue, y = Predicted.HomeValueLR1)) +
  geom_point(alpha = 0.5) +
  # Fix: colour is a literal setting, not a mapping. aes(color = 'black')
  # maps the constant string as data, yielding a default-palette line and a
  # spurious legend instead of a black smoother.
  stat_smooth(color = 'black') +
  xlab('Actual Home Value') +
  ylab('Predicted Home Value')+
  theme_bw()

ggplotly(pl1)
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'
## Warning: The following aesthetics were dropped during statistical transformation:
## x_plotlyDomain, y_plotlyDomain
## ℹ This can happen when ggplot fails to infer the correct grouping structure in
##   the data.
## ℹ Did you forget to specify a `group` aesthetic or to convert a numerical
##   variable into a factor?

Assessing error rate: RMSE of our model

# Finally, we'll check the prediction accuracy with the MSE, MAE, RMSE, and R-squared metrics.

# Install MLmetrics on first run, then attach it. (if (!require(...)) is
# the standard conditional-install idiom: require() returns FALSE instead
# of erroring when the package is missing.)
if(!require('MLmetrics')) {
  install.packages('MLmetrics')
  library('MLmetrics')
}
## Loading required package: MLmetrics
## 
## Attaching package: 'MLmetrics'
## The following objects are masked from 'package:caret':
## 
##     MAE, RMSE
## The following object is masked from 'package:base':
## 
##     Recall
# Test-set error metrics for Model 1. All variables are standardized, so
# MAE/RMSE are in standard-deviation units of HomeValue, not dollars.
mse = MSE(TestSet$HomeValue, predictedHomeValueLR1)
mae = MAE(TestSet$HomeValue, predictedHomeValueLR1)
rmse = RMSE(TestSet$HomeValue, predictedHomeValueLR1)
r2 = R2(TestSet$HomeValue, predictedHomeValueLR1, form = "traditional")

 
cat(" MAE:", mae, "\n", "MSE:", mse, "\n", 
     "RMSE:", rmse, "\n", "R-squared:", r2)
##  MAE: 0.3693966 
##  MSE: 0.3117648 
##  RMSE: 0.558359 
##  R-squared: 0.6252161

R-squared on TestSet for this model is 0.62, which shows the level of correlation between HomeValue and other variables.

The RMSE score for our multiple regression model is 0.55, which is not that bad. Ideally, it should be between 0.2 and 0.5.

So, we will create a new model by eliminating the variables with P-values that are not statistically significant, i.e., Residential, NonRetail, CharlesRiver, Pre40HouseAge, Tax, and BlackIndex. By looking at plot of effect of variables on HomeValue, we can further eliminate PupilPerTeacher, and N_Oxides.

Model2: Multiple Linear Regression using feature extraction variables:

CrimePerCapita, RoomsPerDwelling, BusinessCenterProximity, HighwayProximity, and LowStatus that have statistically significant p-values.

# Model 2: reduced OLS keeping only the predictors selected by the
# feature-extraction step above.
ModelLR2<-lm(HomeValue ~ CrimePerCapita + RoomsPerDwelling + BusinessCenterProximity + HighwayProximity + LowStatus
              , data = TrainSet)
summary(ModelLR2)
## 
## Call:
## lm(formula = HomeValue ~ CrimePerCapita + RoomsPerDwelling + 
##     BusinessCenterProximity + HighwayProximity + LowStatus, data = TrainSet)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -2.0479 -0.3305 -0.0871  0.1849  3.1890 
## 
## Coefficients:
##                         Estimate Std. Error t value Pr(>|t|)    
## (Intercept)             -0.02661    0.03008  -0.885 0.376870    
## CrimePerCapita          -0.11065    0.04271  -2.591 0.009971 ** 
## RoomsPerDwelling         0.42981    0.04035  10.651  < 2e-16 ***
## BusinessCenterProximity -0.12017    0.03569  -3.367 0.000843 ***
## HighwayProximity        -0.07191    0.04218  -1.705 0.089120 .  
## LowStatus               -0.47413    0.04821  -9.835  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.5681 on 354 degrees of freedom
## Multiple R-squared:  0.6828, Adjusted R-squared:  0.6783 
## F-statistic: 152.4 on 5 and 354 DF,  p-value: < 2.2e-16

Predicting the performance of our model on the testing data set

# Predict on the held-out data and attach the predictions for plotting.
predictedHomeValue_LR2 <- predict(ModelLR2, TestSet)
TestSet['Predicted.HomeValueLR2'] <- predictedHomeValue_LR2


# Actual vs. predicted home value with a loess smoother.
pl2 <- TestSet %>%
  # Fix: the original aes(HomeValue, x = Predicted.HomeValueLR2) let the
  # named argument claim x, so HomeValue fell through to y -- the axes were
  # swapped relative to both the axis labels and pl1. Map them explicitly.
  ggplot(aes(x = HomeValue, y = Predicted.HomeValueLR2)) +
  geom_point(alpha = 0.5) +
  # Fix: literal colour belongs outside aes(); mapping the constant string
  # produces a default-palette line and a spurious legend.
  stat_smooth(color = 'black') +
  xlab('Actual Home Value') +
  ylab('Predicted Home Value')+
  theme_bw()

ggplotly(pl2)
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'
## Warning: The following aesthetics were dropped during statistical transformation:
## x_plotlyDomain, y_plotlyDomain
## ℹ This can happen when ggplot fails to infer the correct grouping structure in
##   the data.
## ℹ Did you forget to specify a `group` aesthetic or to convert a numerical
##   variable into a factor?

We can see there a few outliers at HomeValue = 50.

Assessing error rate: RMSE of our model

# Finally, we'll check the prediction accuracy with the MSE, MAE, RMSE, and R-squared metrics.

# Test-set error metrics for Model 2, on the same standardized scale as
# Model 1 so the two models are directly comparable.
mse = MSE(TestSet$HomeValue, predictedHomeValue_LR2)
mae = MAE(TestSet$HomeValue, predictedHomeValue_LR2)
rmse = RMSE(TestSet$HomeValue, predictedHomeValue_LR2)
r2 = R2(TestSet$HomeValue, predictedHomeValue_LR2, form = "traditional")

 
cat(" MAE:", mae, "\n", "MSE:", mse, "\n", 
     "RMSE:", rmse, "\n", "R-squared:", r2)
##  MAE: 0.4376023 
##  MSE: 0.4028163 
##  RMSE: 0.6346781 
##  R-squared: 0.4958962

The R-squared value for this model is 0.49, which is worse than our previous model.

RMSE value is 0.63 which is also more than the previous model1. Model 2 obviously does not work.

Model 3:

So, to further improve the performance we can remove the outliers at HomeValue = 50, and eliminate features like BusinessCenterProximity and HighwayProximity. We will also create an interaction term between LowStatus and RoomsPerDwelling and see if it improves our adjusted R-squared.

# Intended to drop observations where the target is top-coded at 50 ($50k).
# NOTE(review): HomeValue was standardized above, so no value equals the
# raw cap of 50 and this filter removes nothing -- the dims below
# (360 + 146 = 506) show every row survives. Filter before scaling, or
# compare against the scaled equivalent of 50, to actually drop the caps.
df <- subset(BostonHousing, HomeValue != 50)

# Mean-center the two interacting variables so the interaction term is
# less collinear with its main effects.
df['LowStatusC'] <- df$LowStatus - mean(df$LowStatus)
df['RoomsPerDwellingC'] <- df$RoomsPerDwelling - mean(df$RoomsPerDwelling)

# Interaction term: rooms-per-dwelling x lower-status share.
df['RMLStat'] <- df$RoomsPerDwellingC * df$LowStatusC 


#Split Data into TrainSet and TestSet

#make this example reproducible
set.seed(1)

#use 70% of dataset as training set and 30% as test set
BH2<- sample(c(TRUE, FALSE), nrow(df), replace=TRUE, prob=c(0.7,0.3))
TrainSet2  <- df[BH2, ]
TestSet2   <- df[!BH2, ]
dim(TrainSet2)
## [1] 360  17
dim(TestSet2)
## [1] 146  17
# Model 3: interaction model on the reduced feature set.
ModelLR3<-lm(HomeValue ~ CrimePerCapita + RoomsPerDwelling + LowStatus + RMLStat
              , data = TrainSet2)
summary(ModelLR3)
## 
## Call:
## lm(formula = HomeValue ~ CrimePerCapita + RoomsPerDwelling + 
##     LowStatus + RMLStat, data = TrainSet2)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -2.5382 -0.2703 -0.0604  0.2077  3.6385 
## 
## Coefficients:
##                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      -0.17881    0.02905  -6.154 2.04e-09 ***
## CrimePerCapita   -0.11228    0.03202  -3.507 0.000512 ***
## RoomsPerDwelling  0.31032    0.03618   8.577 3.06e-16 ***
## LowStatus        -0.57864    0.03952 -14.641  < 2e-16 ***
## RMLStat          -0.25880    0.02241 -11.549  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.4919 on 355 degrees of freedom
## Multiple R-squared:  0.7615, Adjusted R-squared:  0.7588 
## F-statistic: 283.4 on 4 and 355 DF,  p-value: < 2.2e-16
# Predict on the Model 3 test set and store predictions for plotting.
predictedHomeValue_LR3 <- predict(ModelLR3, TestSet2)
TestSet2['Predicted.HomeValueLR3'] <- predictedHomeValue_LR3


# Actual vs. predicted home values for Model 3.
# Fixes two defects in the original call:
#  - aes(HomeValue, x = ...) silently mapped the actual values to y while the
#    axis labels claimed x was "Actual"; the mapping now matches the labels.
#  - the constant 'black' was mapped inside aes(), which creates a spurious
#    legend entry and triggered the "aesthetics were dropped" warning from
#    ggplotly(); a fixed colour belongs outside aes().
pl3 <- TestSet2 %>%
  ggplot(aes(x = HomeValue, y = Predicted.HomeValueLR3)) +
  geom_point(alpha = 0.5) +
  stat_smooth(color = 'black') +
  xlab('Actual Home Value') +
  ylab('Predicted Home Value') +
  theme_bw()

ggplotly(pl3)
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'
## Warning: The following aesthetics were dropped during statistical transformation:
## x_plotlyDomain, y_plotlyDomain
## ℹ This can happen when ggplot fails to infer the correct grouping structure in
##   the data.
## ℹ Did you forget to specify a `group` aesthetic or to convert a numerical
##   variable into a factor?
# Finally, we'll check the prediction accuracy with the MSE, MAE, RMSE, and R-squared metrics.

# Use `<-` for assignment; spell out the `formula` argument (the original
# `form =` relied on partial argument matching).
mse  <- MSE(TestSet2$HomeValue, predictedHomeValue_LR3)
mae  <- MAE(TestSet2$HomeValue, predictedHomeValue_LR3)
rmse <- RMSE(TestSet2$HomeValue, predictedHomeValue_LR3)
# NOTE(review): caret::R2() documents the argument order as (pred, obs);
# here the actual values are passed first -- confirm the intended order.
r2   <- R2(TestSet2$HomeValue, predictedHomeValue_LR3, formula = "traditional")


cat(" MAE:", mae, "\n", "MSE:", mse, "\n", 
     "RMSE:", rmse, "\n", "R-squared:", r2)
##  MAE: 0.3454339 
##  MSE: 0.2801432 
##  RMSE: 0.5292855 
##  R-squared: 0.5946973

The RMSE score for the interaction model after removing the outliers has improved from 0.55 in Model 1 to 0.52, but our R-squared has gotten worse, from 0.62 to 0.59, which is not good. (Note that Model 3 is evaluated on a different test split, with the HomeValue = 50 outliers removed, so the comparison is only indicative.) Our best model so far is Model 1, with RMSE: 0.558359 and R-squared: 0.6252161. We will further analyse the data using different models to see if we can improve on this.

Comparing models on their performance on the Train Set using Stargazer

stargazer(ModelLR1, ModelLR2, ModelLR3, title = "Comparing three Linear Regression Models", align = TRUE, type = 'text')
## 
## Comparing three Linear Regression Models
## ==================================================================================================
##                                                    Dependent variable:                            
##                         --------------------------------------------------------------------------
##                                                         HomeValue                                 
##                                   (1)                      (2)                      (3)           
## --------------------------------------------------------------------------------------------------
## CrimePerCapita                 -0.126***                -0.111***                -0.112***        
##                                 (0.038)                  (0.043)                  (0.032)         
##                                                                                                   
## Residential                     0.135***                                                          
##                                 (0.040)                                                           
##                                                                                                   
## NonRetail                        0.052                                                            
##                                 (0.054)                                                           
##                                                                                                   
## CharlesRiver                     0.034                                                            
##                                 (0.027)                                                           
##                                                                                                   
## N_Oxides                       -0.207***                                                          
##                                 (0.056)                                                           
##                                                                                                   
## RoomsPerDwelling                0.343***                 0.430***                 0.310***        
##                                 (0.039)                  (0.040)                  (0.036)         
##                                                                                                   
## Pre40HouseAge                    0.024                                                            
##                                 (0.049)                                                           
##                                                                                                   
## BusinessCenterProximity        -0.313***                -0.120***                                 
##                                 (0.051)                  (0.036)                                  
##                                                                                                   
## HighwayProximity                0.294***                 -0.072*                                  
##                                 (0.074)                  (0.042)                                  
##                                                                                                   
## Tax                            -0.256***                                                          
##                                 (0.083)                                                           
##                                                                                                   
## PupilPerTeacher                -0.207***                                                          
##                                 (0.036)                                                           
##                                                                                                   
## BlackIndex                      0.122***                                                          
##                                 (0.032)                                                           
##                                                                                                   
## LowStatus                      -0.389***                -0.474***                -0.579***        
##                                 (0.048)                  (0.048)                  (0.040)         
##                                                                                                   
## RMLStat                                                                          -0.259***        
##                                                                                   (0.022)         
##                                                                                                   
## Constant                         -0.030                   -0.027                 -0.179***        
##                                 (0.027)                  (0.030)                  (0.029)         
##                                                                                                   
## --------------------------------------------------------------------------------------------------
## Observations                      360                      360                      360           
## R2                               0.755                    0.683                    0.762          
## Adjusted R2                      0.745                    0.678                    0.759          
## Residual Std. Error         0.505 (df = 346)         0.568 (df = 354)         0.492 (df = 355)    
## F Statistic             81.838*** (df = 13; 346) 152.385*** (df = 5; 354) 283.412*** (df = 4; 355)
## ==================================================================================================
## Note:                                                                  *p<0.1; **p<0.05; ***p<0.01

We can see from the table that the adjusted R-squared value is highest for the interaction model. However, our evaluation on the test set says otherwise.

Model 4: Regression using SVM

#Model SVM
#Fit SVR model and visualize using scatter plot
#Regression with SVM

# Support-vector regression on all predictors with e1071's defaults;
# summary() below shows eps-regression with a radial kernel, cost = 1.
ModelSVM <- svm(HomeValue~., data=TrainSet)
summary(ModelSVM)
## 
## Call:
## svm(formula = HomeValue ~ ., data = TrainSet)
## 
## 
## Parameters:
##    SVM-Type:  eps-regression 
##  SVM-Kernel:  radial 
##        cost:  1 
##       gamma:  0.07692308 
##     epsilon:  0.1 
## 
## 
## Number of Support Vectors:  247
#Predict using SVM regression
# (renamed from the misspelled `PreditedHomeValueSVM`; the old name is not
# referenced anywhere else in the document)
PredictedHomeValueSVM <- predict(ModelSVM, TestSet)

#Overlay SVM Predictions on Scatter Plot

# Index positions of the test observations; seq_along() is safe even for
# zero-length input, unlike 1:length(x).
x <- seq_along(TestSet$HomeValue)
plot(x, TestSet$HomeValue, pch = 18, col = "red")      # actual values (red points)
lines(x, PredictedHomeValueSVM, lwd = 1, col = "blue") # predictions (blue line)

# Finally, we'll check the prediction accuracy with the MSE, MAE, RMSE, and R-squared metrics.

mse  <- MSE(TestSet$HomeValue, PredictedHomeValueSVM)
mae  <- MAE(TestSet$HomeValue, PredictedHomeValueSVM)
rmse <- RMSE(TestSet$HomeValue, PredictedHomeValueSVM)
# NOTE(review): caret::R2() documents the argument order as (pred, obs);
# here the actual values are passed first -- confirm the intended order.
r2   <- R2(TestSet$HomeValue, PredictedHomeValueSVM, formula = "traditional")


cat(" MAE:", mae, "\n", "MSE:", mse, "\n", 
     "RMSE:", rmse, "\n", "R-squared:", r2)
##  MAE: 0.2514065 
##  MSE: 0.1814117 
##  RMSE: 0.4259245 
##  R-squared: 0.7020166

Our second model for regression is SVM. We fitted the SVM model on the training set and then checked its performance on the test set. Our best performance so far had been Model 1, with RMSE: 0.558359 and R-squared: 0.6252161. ModelSVM achieves RMSE: 0.4259245 and R-squared: 0.7020166, which is much better.

Model 5: SVM using Parameter Tuning

## Tuning SVR model by varying values of maximum allowable error and cost parameter

#Tune the SVM model
# Grid search with tune()'s default 10-fold cross-validation over
# epsilon in {0, 0.1, ..., 1} and cost in 1..20 (11 x 20 = 220 fits;
# this is the slowest step of the analysis).
OptModelSVM=tune(svm, HomeValue~., data=TrainSet,ranges=list(epsilon=seq(0,1,0.1), cost=1:20))

#Print optimum value of parameters
print(OptModelSVM)
## 
## Parameter tuning of 'svm':
## 
## - sampling method: 10-fold cross validation 
## 
## - best parameters:
##  epsilon cost
##      0.1    8
## 
## - best performance: 0.1655382
#Plot the performance of the SVM regression models over the (epsilon, cost)
#grid; the colour scale in the legend shows the cross-validated MSE.
plot(OptModelSVM)

The OptModelSVM has epsilon and cost values of 0.1 and 8 respectively (see the best parameters printed above). The plot above visualizes the performance of each candidate model. The legend on the right displays the value of the Mean Squared Error (MSE). MSE is the square of the RMSE and is also a performance indicator.

## Select the best model and compute RMSE and R2

#Find out the best model (refit on the full training set at the best
#epsilon/cost found by the cross-validated grid search)
BestModelSVM <- OptModelSVM$best.model

#Predict HomeValue using best model
# (renamed from the misspelled `PreditedHomeValueSVMBest`; the old name is
# not referenced anywhere else in the document)
PredictedHomeValueSVMBest <- predict(BestModelSVM, TestSet)

#Calculate RMSE of the best model
rmseSVMBest <- RMSE(TestSet$HomeValue, PredictedHomeValueSVMBest)

# NOTE(review): caret::R2() documents the argument order as (pred, obs);
# here the actual values are passed first -- confirm the intended order.
r2SVMBest <- R2(TestSet$HomeValue, PredictedHomeValueSVMBest, formula = "traditional")

cat( "RMSE:", rmseSVMBest, "\n", 
     "R-squared:", r2SVMBest)
## RMSE: 0.3232587 
##  R-squared: 0.8674123
#Overlay Predictions on Scatter Plot

x <- seq_along(TestSet$HomeValue)
plot(x, TestSet$HomeValue, pch = 18, col = "red")          # actual (red points)
lines(x, PredictedHomeValueSVMBest, lwd = 1, col = "blue") # predictions (blue line)

Our best performance so far had been from Model 4 (ModelSVM), which gave us RMSE: 0.4259245 and R-squared: 0.7020166.

We further tuned the SVM hyperparameters, evaluating every combination of maximum allowable error (epsilon) from 0 to 1 in steps of 0.1 and cost parameter from 1 to 20. This gave us the best model at an epsilon of 0.1 and a cost of 8, which yields RMSE: 0.3232587 and R-squared: 0.8674123. The tuned model reduced our RMSE from 0.42 to 0.32 and increased our R-squared from 0.70 to 0.867, giving us the best model performance so far. We can clearly see from the three overlay graphs that this model gives us the best fit.

Model 6: Decision Tree

# Model Decision Tree

#visualization

# Fit a CART regression tree with rpart's default control parameters;
# summary() prints the CP table, variable importance, and every split.
ModelDT <- rpart(HomeValue~., data=TrainSet)
summary(ModelDT)
## Call:
## rpart(formula = HomeValue ~ ., data = TrainSet)
##   n= 360 
## 
##           CP nsplit rel error    xerror       xstd
## 1 0.46088001      0 1.0000000 1.0050557 0.09858698
## 2 0.19257920      1 0.5391200 0.7060990 0.07980596
## 3 0.05762849      2 0.3465408 0.5029066 0.06647727
## 4 0.04155313      3 0.2889123 0.4333845 0.06153181
## 5 0.03628437      4 0.2473592 0.3920576 0.05691661
## 6 0.01771724      5 0.2110748 0.3141143 0.04706421
## 7 0.01689102      6 0.1933576 0.3024650 0.04701315
## 8 0.01118188      7 0.1764665 0.2767035 0.04356295
## 9 0.01000000      8 0.1652847 0.2722551 0.04224304
## 
## Variable importance
##               LowStatus        RoomsPerDwelling               NonRetail 
##                      26                      20                      12 
##                N_Oxides          CrimePerCapita             Residential 
##                      12                      12                       9 
##           Pre40HouseAge BusinessCenterProximity         PupilPerTeacher 
##                       3                       2                       2 
##              BlackIndex 
##                       1 
## 
## Node number 1: 360 observations,    complexity param=0.46088
##   mean=0.001869466, MSE=1.000486 
##   left son=2 (211 obs) right son=3 (149 obs)
##   Primary splits:
##       LowStatus        < -0.4254358  to the right, improve=0.4608800, (0 missing)
##       RoomsPerDwelling < 1.033088    to the left,  improve=0.4593216, (0 missing)
##       NonRetail        < -0.5702008  to the right, improve=0.2626432, (0 missing)
##       PupilPerTeacher  < 0.5517305   to the right, improve=0.2309303, (0 missing)
##       N_Oxides         < 0.8828701   to the right, improve=0.2157732, (0 missing)
##   Surrogate splits:
##       NonRetail        < -0.5118948  to the right, agree=0.817, adj=0.557, (0 split)
##       N_Oxides         < -0.3080409  to the right, agree=0.806, adj=0.530, (0 split)
##       RoomsPerDwelling < 0.183408    to the left,  agree=0.797, adj=0.510, (0 split)
##       CrimePerCapita   < -0.4049939  to the right, agree=0.786, adj=0.483, (0 split)
##       Residential      < 0.1559169   to the left,  agree=0.767, adj=0.436, (0 split)
## 
## Node number 2: 211 observations,    complexity param=0.05762849
##   mean=-0.5687563, MSE=0.2613397 
##   left son=4 (54 obs) right son=5 (157 obs)
##   Primary splits:
##       LowStatus               < 0.9210027   to the right, improve=0.3764114, (0 missing)
##       CrimePerCapita          < 0.3799476   to the right, improve=0.3574901, (0 missing)
##       BusinessCenterProximity < -0.883119   to the left,  improve=0.3407602, (0 missing)
##       N_Oxides                < 0.8828701   to the right, improve=0.2744873, (0 missing)
##       Tax                     < 0.9449719   to the right, improve=0.2522448, (0 missing)
##   Surrogate splits:
##       BusinessCenterProximity < -1.046912   to the left,  agree=0.834, adj=0.352, (0 split)
##       CrimePerCapita          < 0.6981747   to the right, agree=0.820, adj=0.296, (0 split)
##       Pre40HouseAge           < 1.075535    to the right, agree=0.806, adj=0.241, (0 split)
##       RoomsPerDwelling        < -1.048415   to the left,  agree=0.796, adj=0.204, (0 split)
##       BlackIndex              < -3.39213    to the left,  agree=0.763, adj=0.074, (0 split)
## 
## Node number 3: 149 observations,    complexity param=0.1925792
##   mean=0.8099368, MSE=0.9331197 
##   left son=6 (103 obs) right son=7 (46 obs)
##   Primary splits:
##       RoomsPerDwelling < 1.034512    to the left,  improve=0.4988837, (0 missing)
##       LowStatus        < -1.174624   to the right, improve=0.4182055, (0 missing)
##       PupilPerTeacher  < -1.711606   to the right, improve=0.2048885, (0 missing)
##       N_Oxides         < 0.1665976   to the left,  improve=0.1708625, (0 missing)
##       Pre40HouseAge    < 0.8055423   to the left,  improve=0.1474321, (0 missing)
##   Surrogate splits:
##       LowStatus       < -1.194929   to the right, agree=0.779, adj=0.283, (0 split)
##       PupilPerTeacher < -1.803987   to the right, agree=0.758, adj=0.217, (0 split)
##       Residential     < 3.264509    to the left,  agree=0.725, adj=0.109, (0 split)
##       NonRetail       < -1.389401   to the right, agree=0.725, adj=0.109, (0 split)
##       N_Oxides        < 0.7275339   to the left,  agree=0.718, adj=0.087, (0 split)
## 
## Node number 4: 54 observations,    complexity param=0.01118188
##   mean=-1.103551, MSE=0.1811411 
##   left son=8 (39 obs) right son=9 (15 obs)
##   Primary splits:
##       N_Oxides                < 0.4168615   to the right, improve=0.4117348, (0 missing)
##       Tax                     < 0.8500374   to the right, improve=0.3422643, (0 missing)
##       PupilPerTeacher         < 0.5517305   to the right, improve=0.3392276, (0 missing)
##       CrimePerCapita          < 0.463346    to the right, improve=0.3372936, (0 missing)
##       BusinessCenterProximity < -0.8860397  to the left,  improve=0.2934819, (0 missing)
##   Surrogate splits:
##       BusinessCenterProximity < -0.7596215  to the left,  agree=0.907, adj=0.667, (0 split)
##       Tax                     < -0.06667466 to the right, agree=0.907, adj=0.667, (0 split)
##       NonRetail               < 0.4676467   to the right, agree=0.870, adj=0.533, (0 split)
##       CrimePerCapita          < -0.2447477  to the right, agree=0.852, adj=0.467, (0 split)
##       PupilPerTeacher         < 0.5517305   to the right, agree=0.778, adj=0.200, (0 split)
## 
## Node number 5: 157 observations,    complexity param=0.01771724
##   mean=-0.3848141, MSE=0.156718 
##   left son=10 (65 obs) right son=11 (92 obs)
##   Primary splits:
##       LowStatus       < 0.3286538   to the right, improve=0.2593529, (0 missing)
##       CrimePerCapita  < 0.1214127   to the right, improve=0.1947878, (0 missing)
##       PupilPerTeacher < 0.4593494   to the right, improve=0.1654144, (0 missing)
##       Pre40HouseAge   < 0.5941661   to the right, improve=0.1591771, (0 missing)
##       BlackIndex      < -1.733219   to the left,  improve=0.1525752, (0 missing)
##   Surrogate splits:
##       Pre40HouseAge           < 0.8019898   to the right, agree=0.745, adj=0.385, (0 split)
##       CrimePerCapita          < 0.08039561  to the right, agree=0.688, adj=0.246, (0 split)
##       N_Oxides                < 0.5549381   to the right, agree=0.675, adj=0.215, (0 split)
##       BusinessCenterProximity < -0.8009852  to the left,  agree=0.669, adj=0.200, (0 split)
##       BlackIndex              < -0.3411367  to the left,  agree=0.656, adj=0.169, (0 split)
## 
## Node number 6: 103 observations,    complexity param=0.04155313
##   mean=0.3539748, MSE=0.3976233 
##   left son=12 (96 obs) right son=13 (7 obs)
##   Primary splits:
##       Pre40HouseAge           < 0.9067897   to the left,  improve=0.3654334, (0 missing)
##       LowStatus               < -1.122111   to the right, improve=0.3301865, (0 missing)
##       BusinessCenterProximity < -0.8418265  to the right, improve=0.3225409, (0 missing)
##       N_Oxides                < 0.2960444   to the left,  improve=0.2799519, (0 missing)
##       NonRetail               < 0.791974    to the left,  improve=0.2392294, (0 missing)
##   Surrogate splits:
##       BusinessCenterProximity < -0.9559211  to the right, agree=0.981, adj=0.714, (0 split)
##       N_Oxides                < 0.2960444   to the left,  agree=0.971, adj=0.571, (0 split)
##       CrimePerCapita          < -0.3030303  to the left,  agree=0.961, adj=0.429, (0 split)
##       NonRetail               < 0.791974    to the left,  agree=0.961, adj=0.429, (0 split)
##       HighwayProximity        < 0.7408293   to the left,  agree=0.942, adj=0.143, (0 split)
## 
## Node number 7: 46 observations,    complexity param=0.03628437
##   mean=1.830895, MSE=0.6242919 
##   left son=14 (23 obs) right son=15 (23 obs)
##   Primary splits:
##       RoomsPerDwelling < 1.640105    to the left,  improve=0.45507970, (0 missing)
##       LowStatus        < -1.213134   to the right, improve=0.42065130, (0 missing)
##       PupilPerTeacher  < -1.573034   to the right, improve=0.24842000, (0 missing)
##       Pre40HouseAge    < 0.5106814   to the left,  improve=0.19949120, (0 missing)
##       CrimePerCapita   < -0.3601142  to the left,  improve=0.09966569, (0 missing)
##   Surrogate splits:
##       LowStatus       < -1.213134   to the right, agree=0.761, adj=0.522, (0 split)
##       CrimePerCapita  < -0.4069924  to the left,  agree=0.696, adj=0.391, (0 split)
##       BlackIndex      < 0.3804811   to the right, agree=0.696, adj=0.391, (0 split)
##       PupilPerTeacher < -1.573034   to the right, agree=0.674, adj=0.348, (0 split)
##       Pre40HouseAge   < 0.4893661   to the left,  agree=0.652, adj=0.304, (0 split)
## 
## Node number 8: 39 observations
##   mean=-1.272919, MSE=0.1167287 
## 
## Node number 9: 15 observations
##   mean=-0.663195, MSE=0.08011764 
## 
## Node number 10: 65 observations
##   mean=-0.6246655, MSE=0.1230763 
## 
## Node number 11: 92 observations
##   mean=-0.2153539, MSE=0.1111246 
## 
## Node number 12: 96 observations,    complexity param=0.01689102
##   mean=0.2510421, MSE=0.1632389 
##   left son=24 (80 obs) right son=25 (16 obs)
##   Primary splits:
##       LowStatus        < -1.083601   to the right, improve=0.38821680, (0 missing)
##       RoomsPerDwelling < 0.4381698   to the left,  improve=0.36974710, (0 missing)
##       Tax              < -0.8350514  to the right, improve=0.12863220, (0 missing)
##       PupilPerTeacher  < 0.3207778   to the right, improve=0.11255210, (0 missing)
##       HighwayProximity < -0.8096011  to the left,  improve=0.08848849, (0 missing)
##   Surrogate splits:
##       RoomsPerDwelling < 0.6303086   to the left,  agree=0.875, adj=0.250, (0 split)
##       Residential      < 3.264509    to the left,  agree=0.854, adj=0.125, (0 split)
##       Tax              < -1.119855   to the right, agree=0.844, adj=0.062, (0 split)
## 
## Node number 13: 7 observations
##   mean=1.765623, MSE=1.473981 
## 
## Node number 14: 23 observations
##   mean=1.297882, MSE=0.1743252 
## 
## Node number 15: 23 observations
##   mean=2.363908, MSE=0.5060535 
## 
## Node number 24: 80 observations
##   mean=0.1384614, MSE=0.09443664 
## 
## Node number 25: 16 observations
##   mean=0.8139457, MSE=0.1270178
#Visualizing the Decision tree
rpart.plot(ModelDT)

#Predict using Decision Tree
# (renamed from the misspelled `PreditedHomeValueDT`; the old name is not
# referenced anywhere else in the document)
PredictedHomeValueDT <- predict(ModelDT, TestSet)

#Overlay Decision Tree Predictions on Scatter Plot

# Index positions of the test observations; seq_along() is safe even for
# zero-length input, unlike 1:length(x).
x <- seq_along(TestSet$HomeValue)
plot(x, TestSet$HomeValue, pch = 18, col = "red")      # actual values (red points)
lines(x, PredictedHomeValueDT, lwd = 1, col = "blue")  # predictions (blue line)

# Finally, we'll check the prediction accuracy with the MSE, MAE, RMSE, and R-squared metrics.

mse  <- MSE(TestSet$HomeValue, PredictedHomeValueDT)
mae  <- MAE(TestSet$HomeValue, PredictedHomeValueDT)
rmse <- RMSE(TestSet$HomeValue, PredictedHomeValueDT)
# NOTE(review): caret::R2() documents the argument order as (pred, obs);
# here the actual values are passed first -- confirm the intended order.
r2   <- R2(TestSet$HomeValue, PredictedHomeValueDT, formula = "traditional")


cat(" MAE:", mae, "\n", "MSE:", mse, "\n", 
     "RMSE:", rmse, "\n", "R-squared:", r2)
##  MAE: 0.3612928 
##  MSE: 0.2570004 
##  RMSE: 0.506952 
##  R-squared: 0.6261858

We further try to improve our performance using a Decision Tree. However, it could not beat the best performance of the tuned SVM model, which has a lower RMSE of 0.32 compared to 0.50 for the Decision Tree, and a higher R-squared of 0.867 compared to 0.62. The tuned SVM model is still our best performer.

#check for important variables
# Importance scores for each predictor in the fitted tree (larger = more
# influential; the values and their exact scaling come from rpart).
ModelDT$variable.importance
##               LowStatus        RoomsPerDwelling               NonRetail 
##              225.639634              172.849836              108.569877 
##                N_Oxides          CrimePerCapita             Residential 
##              107.997657              101.341823               80.714827 
##           Pre40HouseAge BusinessCenterProximity         PupilPerTeacher 
##               26.395078               21.954655               20.429869 
##              BlackIndex                     Tax        HighwayProximity 
##                7.731267                3.065188                2.138057

Model 7: Decision Tree with best parameters using pruning

The regression tree can be pruned for better results. Pruning translates to trimming. Pruning overcomes the problem of overfitting. We will be trimming the leaf nodes, and this way reduce the size of the tree. Pruning a tree requires us to choose the best Complexity Parameter (CP) value. We can either refer to the CP table or explicitly ask for the best CP value. Best CP value has the lowest Cross validation error (xerror) value.

#Finding the best CP value
# printcp() lists, for each candidate CP, the number of splits and the
# cross-validated error (xerror) used to choose where to prune.
printcp(ModelDT) #Choose the best CP value based on the lowest xerror from the CP table.
## 
## Regression tree:
## rpart(formula = HomeValue ~ ., data = TrainSet)
## 
## Variables actually used in tree construction:
## [1] LowStatus        N_Oxides         Pre40HouseAge    RoomsPerDwelling
## 
## Root node error: 360.17/360 = 1.0005
## 
## n= 360 
## 
##         CP nsplit rel error  xerror     xstd
## 1 0.460880      0   1.00000 1.00506 0.098587
## 2 0.192579      1   0.53912 0.70610 0.079806
## 3 0.057628      2   0.34654 0.50291 0.066477
## 4 0.041553      3   0.28891 0.43338 0.061532
## 5 0.036284      4   0.24736 0.39206 0.056917
## 6 0.017717      5   0.21107 0.31411 0.047064
## 7 0.016891      6   0.19336 0.30246 0.047013
## 8 0.011182      7   0.17647 0.27670 0.043563
## 9 0.010000      8   0.16528 0.27226 0.042243
#From the CP table it is observed that 0.27226 is the lowest xerror value
#(the original comment said 0.24181, which does not appear in the table).
#However, the number of splits (nsplit) there is 8. This has the same number of splits as our original tree.
#We will get the same result as the nsplits will be the same for the original and pruned tree.
#Hence, we should look at the next-lowest xerror value (0.27670 at CP 0.011182, nsplit = 7).

#Prune the tree with best cp value (complexity parameter)
prunedtree <- prune(ModelDT, cp = 0.011182) 

#Visualizing the pruned tree
rpart.plot(prunedtree)

#Checking the order of variable importance
prunedtree$variable.importance
##               LowStatus        RoomsPerDwelling               NonRetail 
##             225.6396339             172.8498358             106.4219133 
##                N_Oxides          CrimePerCapita             Residential 
##             103.9702242              99.4623549              80.7148267 
##           Pre40HouseAge         PupilPerTeacher BusinessCenterProximity 
##              26.3950778              19.6243823              19.2697003 
##              BlackIndex        HighwayProximity                     Tax 
##               7.7312671               2.1380566               0.3802326
#performance of regression tree
predprune <- predict(prunedtree, TestSet)
# Finally, we'll check the prediction accuracy with the MSE, MAE, RMSE, and R-squared metrics.

# Use `<-` for assignment; spell out the `formula` argument (the original
# `form =` relied on partial argument matching).
mse  <- MSE(TestSet$HomeValue, predprune)
mae  <- MAE(TestSet$HomeValue, predprune)
rmse <- RMSE(TestSet$HomeValue, predprune)
# NOTE(review): caret::R2() documents the argument order as (pred, obs);
# here the actual values are passed first -- confirm the intended order.
r2   <- R2(TestSet$HomeValue, predprune, formula = "traditional")


cat(" MAE:", mae, "\n", "MSE:", mse, "\n", 
     "RMSE:", rmse, "\n", "R-squared:", r2)
##  MAE: 0.3760765 
##  MSE: 0.2818576 
##  RMSE: 0.5309026 
##  R-squared: 0.5823294

Pruning the tree didn't reduce our RMSE or increase our R-squared, so our first tree is good. Sometimes pruning is effective; in our case, it wasn't.

Model 8: Random Forest

#Random Forest
# Seed fixed so the bootstrap samples (and hence the fitted forest) are reproducible.
set.seed(123)
# proximity = TRUE additionally stores the case-proximity matrix on the fit.
ModelRF <- randomForest(HomeValue~., data=TrainSet, proximity=TRUE) 
print(ModelRF)
## 
## Call:
##  randomForest(formula = HomeValue ~ ., data = TrainSet, proximity = TRUE) 
##                Type of random forest: regression
##                      Number of trees: 500
## No. of variables tried at each split: 4
## 
##           Mean of squared residuals: 0.1357942
##                     % Var explained: 86.43
summary(ModelRF)
##                 Length Class  Mode     
## call                 4 -none- call     
## type                 1 -none- character
## predicted          360 -none- numeric  
## mse                500 -none- numeric  
## rsq                500 -none- numeric  
## oob.times          360 -none- numeric  
## importance          13 -none- numeric  
## importanceSD         0 -none- NULL     
## localImportance      0 -none- NULL     
## proximity       129600 -none- numeric  
## ntree                1 -none- numeric  
## mtry                 1 -none- numeric  
## forest              11 -none- list     
## coefs                0 -none- NULL     
## y                  360 -none- numeric  
## test                 0 -none- NULL     
## inbag                0 -none- NULL     
## terms                3 terms  call
#Predict using Random Forest
# (renamed from the misspelled `PreditedHomeValueRF`; the old name is not
# referenced anywhere else in the document)
PredictedHomeValueRF <- predict(ModelRF, TestSet)

#Overlay Random Forest Predictions on Scatter Plot

# Index positions of the test observations; seq_along() is safe even for
# zero-length input, unlike 1:length(x).
x <- seq_along(TestSet$HomeValue)
plot(x, TestSet$HomeValue, pch = 18, col = "red")      # actual values (red points)
lines(x, PredictedHomeValueRF, lwd = 1, col = "blue")  # predictions (blue line)

# Finally, we'll check the prediction accuracy with the MSE, MAE, RMSE, and R-squared metrics.

mse  <- MSE(TestSet$HomeValue, PredictedHomeValueRF)
mae  <- MAE(TestSet$HomeValue, PredictedHomeValueRF)
rmse <- RMSE(TestSet$HomeValue, PredictedHomeValueRF)
# NOTE(review): caret::R2() documents the argument order as (pred, obs);
# here the actual values are passed first -- confirm the intended order.
r2   <- R2(TestSet$HomeValue, PredictedHomeValueRF, formula = "traditional")


cat(" MAE:", mae, "\n", "MSE:", mse, "\n", 
     "RMSE:", rmse, "\n", "R-squared:", r2)
##  MAE: 0.2353101 
##  MSE: 0.1292226 
##  RMSE: 0.3594754 
##  R-squared: 0.810815

Random Forest didn’t improve our performance either: it could not beat the tuned SVM model, which has a lower RMSE of 0.32 compared to 0.359 for the Random Forest, and a higher R-squared of 0.867 compared to 0.81. The tuned SVM model is still our best performer, although the Random Forest also gave us a really good performance.

Conclusion:

To conclude, we used 8 different models to find the best prediction of the median HomeValue of the various localities in Boston: 3 linear regression models, 2 SVM models, 2 decision tree models, and 1 random forest. We found the best model to be the tuned SVM, with epsilon and cost values of 0.1 and 8 respectively, which gave us an RMSE of 0.32 and an R-squared of 0.87.